# ==============================================================================
# file name: 02-summarize-survey-visits-YG.R
# date:	Dec 14, 2022
# author: Bernhard Clemm / Tiago Ventura 
# purpose: match visit-level data to host lists & summarize survey taking (YouGov)
# THIS SCRIPT REQUIRES ACCESS TO THE RAW DATA BASE AND SERVES FOR REFERENCE ONLY
# ==============================================================================

# Setup ========================================================================

# Load the raw visit-level webtracking data (the path is left blank in this
# reference script).
browsing_YG <- read_rds("")

# Notes on the raw webtracking data:
# - Time range: 31 March 2011 - 8 November 2018.
# - Size: 25,320,661 observations (website visits).
# - Observations collected through proxy and VPN (18,865,151 rows) are
#   excluded due to data quality issues.
# - Observations prior to 17 September 2018 (6,564 rows) are treated as
#   erroneous date-time settings and excluded from the subsequent analysis.
# NOTE(review): only the date filter below is applied in this script; the
# proxy/VPN exclusion is not visible here -- confirm where it happens. The
# filter cutoff ("2018-09-01") also differs from the 17 September date stated
# above -- confirm which one is intended.

browsing_YG <- filter(browsing_YG, date > "2018-09-01")
  
# Read and combine host lists ==================================================
  
# Host lists; `method` labels the list each host comes from
hosts_bevec_YG <- mutate(
  read.csv("data/browsing_hosts/hosts_bevec_YG.csv"),
  method = "bevec"
)

hosts_survey_YG <- mutate(
  read.csv("data/browsing_hosts/hosts_survey_YG.csv"),
  method = "survey_url"
)

# Manually coded top-500 host lists from the three data sets (YG, LU, FB);
# they are combined into one list further below
hosts_500_YG_coded <- read.csv("data/browsing_hosts/hosts_500_YG_coded.csv")
hosts_500_LU_coded <- read.csv("data/browsing_hosts/hosts_500_LU_coded.csv")
hosts_500_FB_coded <- read.csv("data/browsing_hosts/hosts_500_FB_coded.csv")

## explanation of manual coding
## code_1 is code from principal coder
## code_2 is code from second coder (for ICR only)
## categories: 1 = survey-only site; 2 = any rewards, incl. survey; 0 = no survey; 99 = unknown
## for main analysis, we collapse categories 1 and 2; and we only use code_1
## for robustness, we can later disaggregate categories 1/2

## bind & unique the three lists & keep only when code_1 is 1 or 2

# Stack the (url_host, code_1) pairs of the three coded lists, drop duplicate
# pairs, keep only hosts coded 1 or 2, and turn the code into a method label
# ("manual_labelling1" / "manual_labelling2").
hosts_500_all <- rbind(
  select(hosts_500_YG_coded, url_host, code_1),
  select(hosts_500_LU_coded, url_host, code_1),
  select(hosts_500_FB_coded, url_host, code_1)
) %>%
  distinct() %>%
  filter(code_1 %in% c(1, 2)) %>%
  transmute(url_host, method = str_c("manual_labelling", code_1))

# NOTE(review): `path` is not defined in the visible part of this script;
# presumably it is set elsewhere -- confirm.
write.csv(
  hosts_500_all,
  paste0(path, "data/url_hosts/hosts_500_all.csv"),
  row.names = FALSE
)

# Final list of all hosts representing survey sites: the two method-tagged
# lists plus the manually labelled hosts, whose host column is renamed from
# url_host to page_domain before stacking.
hosts_final <- rbind(
  hosts_bevec_YG,
  hosts_survey_YG,
  select(hosts_500_all, page_domain = url_host, method)
)

# Identify survey visits ======================================================

# Match visits to the host list on page_domain: matched visits receive the
# list's columns and visit_survey = 1, unmatched visits get visit_survey = 0.
# NOTE(review): hosts_final stacks several lists without de-duplicating hosts,
# so a host present in more than one list yields one joined row per list entry
# for each visit -- confirm this duplication is intended.
browsing_YG <- browsing_YG %>%
  left_join(mutate(hosts_final, visit_survey = 1), by = "page_domain") %>%
  mutate(visit_survey = coalesce(visit_survey, 0))

# Flag visits to Google, Amazon, YouTube and Facebook for comparison.
# Regex approach: a host matches a platform when it
# (1) ends with the platform domain, e.g. the pattern "google.com" does not
#     match the host "google.co", and
# (2) either (a) is exactly the platform domain, or
#     (b) has a "." right before it, e.g. "news.google.com" matches but
#         "fakegoogle.com" does not.
# Each flag is 1 for a match and 0 otherwise.

browsing_YG <- browsing_YG %>%
  mutate(
    visit_google = as.numeric(
      grepl("^google\\.com$|\\.google\\.com$", page_domain)
    ),
    visit_amazon = as.numeric(
      grepl("^amazon\\.com$|\\.amazon\\.com$", page_domain)
    ),
    visit_youtube = as.numeric(
      grepl("^youtube\\.com$|\\.youtube\\.com$", page_domain)
    ),
    visit_facebook = as.numeric(
      grepl("^facebook\\.com$|\\.facebook\\.com$", page_domain)
    )
  )

# Add duration variable ========================================================

# Build the processed visit table:
# - person_id:        caseid, renamed
# - created_utc:      timestamp parsed from the date and time columns
# - created_utc_next: timestamp of the same person's next visit
# - timediff:         seconds until that next visit (NA for a person's last
#                     visit, which has no successor)
# - duration_s_5_na:  timediff, set to NA when longer than 300 seconds
# - created_date:     calendar date of the visit
browsing_YG_processed <- browsing_YG %>%
  rename(person_id = caseid) %>%
  mutate(
    # Parse with an explicit time zone. Without `tz`, as.POSIXct() uses the
    # session's time zone, so durations around daylight-saving changes and the
    # derived created_date would depend on the machine running the script.
    # NOTE(review): assumes the recorded timestamps are UTC, as the column
    # name suggests -- confirm.
    created_utc = as.POSIXct(
      str_c(date, " ", time),
      format = "%Y-%m-%d %H:%M:%OS",
      tz = "UTC"
    )
  ) %>%
  group_by(person_id) %>%
  mutate(created_utc_next = dplyr::lead(created_utc, order_by = created_utc)) %>%
  ungroup() %>%
  mutate(
    timediff = difftime(created_utc_next, created_utc, units = "secs"),
    # cutoff at 300 seconds = 5 minutes; all durations > cutoff are set to NA
    duration_s_5_na = ifelse(timediff > 300, NA, timediff),
    # date variable in the same time zone as the parsed timestamp
    created_date = as.Date(created_utc, tz = "UTC")
  )

write_csv(browsing_YG_processed, "") # this is exported and reused in 03-identify-rep-participation-YG.R

# Aggregate by individual ======================================================

# One row per person:
# - n_total / n_<site>:        number of visits overall / to survey hosts,
#                              Google, YouTube, Facebook, Amazon
# - s_total_5_na / s_<site>_5_na: summed capped durations (seconds), overall
#                              and per site; NA durations are ignored
# - n_days_active:             number of distinct visit dates
# - n_survey_days_active:      number of distinct dates with a survey visit
people_YG <- browsing_YG_processed %>%
  group_by(person_id) %>%
  summarise(
    n_total = n(),
    s_total_5_na = sum(duration_s_5_na, na.rm = TRUE),
    n_days_active = n_distinct(created_date),
    # Count distinct dates among survey visits only. Summing
    # n_distinct(created_date) over survey visits would instead give
    # n_survey * n_days_active, because n_distinct() is evaluated on the
    # person's full set of dates.
    n_survey_days_active = n_distinct(created_date[visit_survey %in% 1]),
    n_survey = sum(visit_survey, na.rm = TRUE),
    n_google = sum(visit_google, na.rm = TRUE),
    n_youtube = sum(visit_youtube, na.rm = TRUE),
    n_facebook = sum(visit_facebook, na.rm = TRUE),
    n_amazon = sum(visit_amazon, na.rm = TRUE),
    s_survey_5_na = sum(
      if_else(visit_survey == 1, duration_s_5_na, 0), na.rm = TRUE
    ),
    s_google_5_na = sum(
      if_else(visit_google == 1, duration_s_5_na, 0), na.rm = TRUE
    ),
    s_youtube_5_na = sum(
      if_else(visit_youtube == 1, duration_s_5_na, 0), na.rm = TRUE
    ),
    s_facebook_5_na = sum(
      if_else(visit_facebook == 1, duration_s_5_na, 0), na.rm = TRUE
    ),
    s_amazon_5_na = sum(
      if_else(visit_amazon == 1, duration_s_5_na, 0), na.rm = TRUE
    )
  ) %>%
  # any remaining NA in the survey summaries is set to 0
  mutate(across(contains("survey"), ~ ifelse(is.na(.), 0, .)))

# Add count for each survey method =============================================

# Per person, count survey visits separately for each host-list method and
# spread the counts into one column per method (n_<method>).
browsing_YG_method <- browsing_YG_processed %>%
  # remove non-survey visits (rows without a matched host list)
  filter(!is.na(method)) %>%
  group_by(person_id, method) %>%
  summarise(n_survey = sum(visit_survey), .groups = "drop") %>%
  pivot_wider(
    id_cols = person_id,
    names_from = method,
    values_from = n_survey,
    names_prefix = "n_"
  ) %>%
  # Methods a person never used are 0. starts_with() restricts the fill to the
  # n_<method> columns; contains("n_") would also match "person_id".
  mutate(across(starts_with("n_"), ~ ifelse(is.na(.), 0, .)))

# NOTE(review): people without any survey visit have no row in
# browsing_YG_method, so their n_<method> columns are NA after this join --
# confirm whether they should be 0.
people_YG <- left_join(people_YG, browsing_YG_method, by = "person_id")

# Add predominant device variable ==============================================

# Visits per person and device class. device_type is collapsed to "Laptop"
# (original value "Laptop/Desktop") or "Mobile" (any other non-missing value).
# `total` is computed before rows with missing values are dropped, so visits
# with a missing device_type still count towards the denominator of the share.
people_device_YG <- browsing_YG_processed %>%
  mutate(
    device_type = ifelse(device_type == "Laptop/Desktop", "Laptop", "Mobile")
  ) %>%
  count(person_id, device_type) %>%
  group_by(person_id) %>%
  mutate(
    total = sum(n),
    share_devices = n / total
  ) %>%
  drop_na() %>%
  arrange(device_type, share_devices)

# Wide format (one row per person) plus the predominant device:
# more than 90% of visits on one device class -> that class,
# otherwise "Desktop/Mobile".
people_device_YG_w <- people_device_YG %>%
  select(-total) %>%
  pivot_wider(
    names_from = device_type,
    values_from = c(n, share_devices)
  ) %>%
  clean_names() %>%
  mutate(
    device_main = case_when(
      share_devices_laptop > .9 ~ "Desktop",
      share_devices_mobile > .9 ~ "Mobile",
      TRUE ~ "Desktop/Mobile"
    )
  )

people_YG <- left_join(
  people_YG,
  select(people_device_YG_w, person_id, device_main),
  by = "person_id"
)

write_csv(people_YG, "data/browsing_summarized/people_visits_YG.csv")

# Top-10 most popular survey hosts =============================================

# Number of survey visits per host (one row per page_domain with at least one
# survey visit), labelled with the data set name and written to CSV.
# NOTE(review): the section title says "Top-10", but no top-10 selection is
# made here; all survey hosts are exported.
hosts_popular_YG <- browsing_YG %>%
  filter(visit_survey == 1) %>%
  group_by(page_domain) %>%
  count() %>%
  rename(url_host = page_domain, ct = n) %>%
  mutate(dataset = "YouGov")

write_csv(
  hosts_popular_YG,
  "data/raw_to_processed/url_hosts/hosts_popular_YG.csv"
)
